##Set directory
setwd("~/Dropbox/PoliticalScienceTopics/replication")
#clean history
rm(list=ls())

#load libraries

library(DBI)
library(RSQLite)
library(plyr)
library(stringr)
library(reshape2)
library(dplyr)
#show numbers instead of e
options("scipen" = 10)
options()$scipen



#######################
#get access to database
###################
# setting up a driver
drv <- dbDriver("SQLite")
# connect to the database
con <- dbConnect(drv, dbname = "jstor_abstracts_replication.db")
# list the database tables
dbListTables(con) #names of table of interest: articles
#list the contents of a table
names(dbReadTable(con, "articles"))


################################################
###for info

#count abstracts per journal/year
noartbyjour  <- dbGetQuery(con, "SELECT year, journal, COUNT (*) AS countarticle 
                           FROM articles GROUP by year, journal")
unique(noartbyjour$journal)

#check range of jornal coverage
range(noartbyjour[noartbyjour$journal=="The Journal of Politics",]$year) #starts 82
range(noartbyjour[noartbyjour$journal=="American Political Science Review",]$year)#starts52
range(noartbyjour[noartbyjour$journal=="American Journal of Political Science",]$year)#starts 73


unique(noartbyjour[noartbyjour$journal=="American Political Science Review",]$year)




########################


#get overall number of articles by year >1981 because JoP starts 82
no.art <- dbGetQuery(con, "SELECT year, COUNT (*) AS countarticle 
                     FROM articles 
                     WHERE (journal = 'American Journal of Political Science' OR
                     journal =  'American Political Science Review' OR
                     journal =  'The Journal of Politics') AND
                     (year > 1981 AND year < 2015)
                     GROUP by year")


###no. abstracts JOP
no.abstractsjob <-dbGetQuery(con, "SELECT journal, COUNT (*) AS countabstracts 
                             FROM articles 
                             WHERE (
                             journal =  'The Journal of Politics') AND
                             (year > 1981 AND year <= 2014)
                             GROUP by journal")

###no. abstracts APSR and AJPS
no.abstractsAA <-dbGetQuery(con, "SELECT journal, COUNT (*) AS countabstracts 
                            FROM articles 
                            WHERE (journal = 'American Journal of Political Science' OR
                            journal =  'American Political Science Review' ) AND
                            (year > 1976 AND year <= 2014)
                            GROUP by journal")
###sum abstracts
sum(no.abstractsAA$countabstracts)+no.abstractsjob$countabstracts
##################################################################
###retrieve database
database <-   dbGetQuery(con,"SELECT * FROM articles")
names(database)
########################################################
#one variable for each positive word: * number of abstracts containing this word
##UoA count of each word per absract

#function to count positive or negative words in abstract
find_patterns <- function(string, pattern) {
  tmp <- lapply(pattern, function(pattern, string){str_count(string, pattern)}, string)
  tmp <- as.data.frame(t(do.call(rbind, tmp))) 
  names(tmp) <- pattern
  tmp
}

positivewords <- c("amazing", "assuring", "astonishing", "bright", "creative", "encouraging", "enormous", "excellent", 
                   "favorable", "groundbreaking", "hopeful", "innovative", "inspiring", "inventive", "novel", "phenomenal", 
                   "prominent", "promising", "reassuring", "remarkable", "robust", "spectacular", "supportive", "unique", "unprecedented")

####matricx that counts how often each of the words occurs in an abstract
p_byword <-cbind(database,find_patterns(database$abstract,positivewords))
names(p_byword)
p_byword$sumpositive<- rowSums(p_byword[,9:33])



#same procedure for negative words
# "mediocre" not used
negativewords <- c("detrimental",  "disappointing",  "disconcerting",  "discouraging",  "disheartening",  "disturbing",  "frustrating",  
                   "futile",  "hopeless",  "impossible",  "inadequate",  "ineffective",    "insufficient",  "irrelevant",  
                   "low-quality",   "pessimistic",  "substandard",  "unacceptable",  "unpromising",  "unsatisfacty",  "unsatisfying",  
                   "useless",  "weak",  "worrisome")


n_byword <-find_patterns(database$abstract,negativewords)
names(n_byword)
n_byword$sumnegative<- rowSums(n_byword)
#####bind both matrices together
word_count <- cbind(p_byword,n_byword)
names(word_count)


#######create a dummy variable when a positive word was used at least once
#####for UoA each word at least once per abstract
names(word_count)
word_dummy <- as.data.frame(apply(word_count[,c(positivewords, negativewords)],2,function(x) {ifelse(x>0,1,0)}))
##rename variables

for (i in 1:length(names(word_dummy))){
  
  names(word_dummy)[i] <- paste(names(word_dummy)[i],"dum",sep="_")
  
}

###rbind with information on abstracts etc.
names(word_count[,1:8])
word_dummy <- cbind(word_count[,1:8],word_dummy)

names(word_dummy)
###create vector with variable names
pos_dum <- paste(positivewords,"dum",sep="_")
neg_dum <- paste(negativewords,"dum",sep="_")
names(word_dummy[,pos_dum])
###rowSums ofr positive words
word_dummy$sumpositive_dum <- rowSums(word_dummy[,pos_dum])

names(word_dummy)
###rowSums ofr negative words
word_dummy$sumnegative_dum <- rowSums(word_dummy[,neg_dum])
####################################################################
#####aggregate to yearly level from 1982 till2014
#######UoA currently at abstract word
#1.subset to relevant cases JOP, APSR, AJPS
datawords <- subset(word_count, 
                    (year > 1981 & year < 2015))
novel_count <- ddply(datawords, .(year), summarise, count=(sum(novel)))
names(datawords)

#2.aggregate to year level
###wordcount_year contains information on sum of words by year: thus, 
#if in one abstract 3 three time amazing, then it is counted 3 times
#* number of times each positive word occurs---each column with name of word
names(datawords)
# numcolwise() to run a summary over all numeric columns to sum by year over all words
wordcount_year <- ddply(datawords[,c("year","wordcounts", positivewords,negativewords)], .(year), numcolwise(sum))
names(wordcount_year)
#3. get relative frequency

####kick out those words which are not or only very rarely used: less then 6 times
wordcount_year <- wordcount_year[, which(colSums(wordcount_year) >5)]
names(wordcount_year)

###create table with positive and negative words
tpos <- names(wordcount_year)[3:17]
tneg <- names(wordcount_year)[18:25]
tneg <- c(tneg,"","","","","","","")
tab1 <- as.data.frame(t(rbind(tpos,tneg)))
names(tab1) <- c("Positive words", "Negative words")

### share of all  positive/negative words by year 
#create vector with reminaing pos/neg words
pos_6 <- names(wordcount_year[,3:17])
neg_6 <- names(wordcount_year[,18:25])
#sum all pos words
wordcount_year$sumpositive <- rowSums(wordcount_year[,pos_6])
wordcount_year$sharepositive <- wordcount_year$sumpositive / 
  wordcount_year$wordcounts
#sum neg words
wordcount_year$sumnegative <- rowSums(wordcount_year[,neg_6])
wordcount_year$sharenegative <- wordcount_year$sumnegative/ 
  wordcount_year$wordcounts

csv_wordcount_year <- wordcount_year[,c("year","sharepositive","sharenegative")]




######################################
####for each word count how often at least once in abstract
#1.subset to relevant time period
datawords_dummy <- subset(word_dummy, 
                          (year > 1981 & year < 2015))

#2.aggregate to yearly level
###worddummy_year contains information on yearly sum a word showed up at least once in an abstract 
#if in one abstract 3 three time amazing, then it is counted 1 times in that abstract
#* one variable for each positive word: * number of abstracts containing this word
names(datawords_dummy)

worddummy_year <- ddply(datawords_dummy[,c("year","wordcounts",pos_dum,neg_dum)], .(year), numcolwise(sum))

#3. get relative frequency

####kick out those words which are not or only very rarely used: <6

worddummy_year <- worddummy_year[, which(colSums(worddummy_year) >5)]
names(worddummy_year)
#create vector with relevant names
pos_dum6 <- names(worddummy_year[,3:15])
neg_dum6 <- names(worddummy_year[,16:23])
#4. merge with no. articles JOP, APSR, AJPS

worddummy_year <- merge(worddummy_year,no.art, by="year")
names(worddummy_year)


######create yearly share of count of each positive word 
names(worddummy_year)
#rel. frequency dived by no. of abstracts
worddummy_year_eachword <- as.data.frame(apply(worddummy_year[,c(pos_dum6,neg_dum6)],
                                               MARGIN=2, function(x) 
                                               {x/worddummy_year$countarticle}))

worddummy_year_eachword <- cbind(worddummy_year[,1:2],worddummy_year_eachword)
names(worddummy_year_eachword)






#########################################################
#UoA abstract
#number of abstracts with at least one positive/negative word by year for all 3 journals
#regardless which positive word

############################################################
#positive words NOT taken from list from Vinkers et al:
#significant, amazing, astonishing, bright, groundbreaking,
#hopeful, inventive, inspiring, phenomenal, reassuring, spectacular
#because only occurred 5 times or less in time between 1982 and 2014

############################################################

pos_abstract <- dbGetQuery(con,"SELECT  year, COUNT(*)  as positive_abs
                           FROM articles 
                           WHERE (
                           abstract LIKE '%assuring%' OR
                           abstract LIKE '%creative%' OR
                           abstract LIKE '%encouraging%' OR 
                           abstract LIKE '%enormous%'OR 
                           abstract LIKE '%excellent%'OR 
                           abstract LIKE '%favorable%'OR 
                           abstract LIKE '%innovative%'OR 
                           abstract LIKE '%novel%' OR 
                           abstract LIKE '%prominent%' OR
                           abstract LIKE '%promising%' OR 
                           abstract LIKE '%remarkable%'OR 
                           abstract LIKE '%robust%' OR
                           abstract LIKE '%supportive%'OR
                           abstract LIKE '%unique%'OR 
                           abstract LIKE '%unprecedented%') AND
                           (year > 1981 AND year < 2015) 
                           GROUP by year") 

############################################################
#negative words NOT taken from list from Vinkers et al:
#insignificant, disappointing, disconcerting, discouraging,
#disheartening, disturbing, frustrating, futile, hopeless,
#low-quality, substandard, unpromising, unsatisfacty, unsatisfying,
#useless, worrisome, unacceptable

##############################################################
neg_abstract <- dbGetQuery(con,"SELECT  year, COUNT(*)  as negative_abs
                           FROM articles 
                           WHERE (abstract LIKE '%detrimental%' OR
                           abstract LIKE '%impossible%' OR 
                           abstract LIKE '%inadequate%' OR 
                           abstract LIKE '%ineffective%' OR 
                           abstract LIKE '%insufficient%' OR 
                           abstract LIKE '%irrelevant%' OR
                           abstract LIKE '%pessimistic%' OR 
                           abstract LIKE '%weak%' ) AND
                           (year > 1981 AND year < 2015)
                           Group by year ")

all_abstract <- merge(no.art, pos_abstract, by="year", all.x=T)

all_abstract <- merge(all_abstract, neg_abstract, by="year", all.x=T)
###share number of anstracts with pos./neg. words
all_abstract$rel_pos_abs <- all_abstract$positive_abs   / all_abstract$countarticle
all_abstract$rel_neg_abs <- all_abstract$negative_abs / all_abstract$countarticle

##########################
#save for data
csv_main<- merge(all_abstract,csv_wordcount_year, by="year" )
csv_main<- merge(csv_main,worddummy_year_eachword, by="year" )
write.csv(csv_main, file = "csv_main.csv", row.names = FALSE)

################################################
#########################################
#Robustness
################look only at apsr and ajps

no.art_aa <- dbGetQuery(con, "SELECT year, COUNT (*) AS countarticle 
                        FROM articles 
                        WHERE (journal = 'American Journal of Political Science' OR
                        journal =  'American Political Science Review') AND
                        (year > 1976 AND year < 2015)
                        GROUP by year")


###########################################################
####counts how often each of the words occurs in an abstract
####only apsr and ajps


#####aggregate to yearly level and only look at the two big journals

#1.subset to relevant cases  APSR, AJPS
datawords_aa <- subset(word_count, (journal == "American Political Science Review" | 
                                      journal == "American Journal of Political Science" ) &
                         (year > 1976 & year < 2015))
#2.aggregate to year level
###wordcount_year contains information on sum of words by year: thus, 
#if in one abstract 3 three time amazing, then it is counted 3 times
#* number of times each positive word occurs---each colum with name of word
names(datawords_aa)
wordcount_year_aa <- ddply(datawords_aa[,c("year","wordcounts",positivewords,negativewords)], .(year), numcolwise(sum))

#3. get relative frequency

####kick out those words which are only rarely used: < 6


wordcount_year_aa <- wordcount_year_aa[, which(colSums(wordcount_year_aa) >5)]
names(wordcount_year_aa)
pos_aa6 <- names(wordcount_year_aa[,3:15])
neg_aa6 <- names(wordcount_year_aa[,16:23])

### share of all  positive/negative words by year 
wordcount_year_aa$sumpositive <- rowSums(wordcount_year_aa[,pos_aa6])
wordcount_year_aa$sharepositive <- wordcount_year_aa$sumpositive / 
  wordcount_year_aa$wordcounts

wordcount_year_aa$sumnegative <- rowSums(wordcount_year_aa[,neg_aa6])
wordcount_year_aa$sharenegative <- wordcount_year_aa$sumnegative/ 
  wordcount_year_aa$wordcounts

csv_wordcount_year_aa <- wordcount_year_aa[,c("year", "sharepositive","sharenegative")]

names(wordcount_year_aa)
tposaa <- names(wordcount_year)[3:15]
tnegaa <- names(wordcount_year)[16:23]
tnegaa <- c(tneg,"","","","","")
tab1aa <- as.data.frame(t(rbind(tpos,tneg)))
names(tab1aa) <- c("Positive words", "Negative words")

#number of abstracts with at least one positive/negative word by year for 2 journals
#each word showed up more than 5 times in period of time

#regardless which positive word
pos_abstract_aa <- dbGetQuery(con,"SELECT  year, COUNT(*)  as positive_abs
                              FROM articles 
                              WHERE (
                              abstract LIKE '%encouraging%' OR 
                              abstract LIKE '%enormous%'OR 
                              abstract LIKE '%favorable%'OR 
                              abstract LIKE '%innovative%'OR 
                              abstract LIKE '%inspiring%'OR 
                              abstract LIKE '%novel%' OR 
                              abstract LIKE '%prominent%' OR
                              abstract LIKE '%promising%' OR 
                              abstract LIKE '%remarkable%'OR 
                              abstract LIKE '%robust%' OR
                              abstract LIKE '%supportive%'OR
                              abstract LIKE '%unique%'OR 
                              abstract LIKE '%unprecedented%') AND
                              (journal = 'American Journal of Political Science' OR
                              journal =  'American Political Science Review') AND
                              (year > 1976 AND year < 2015)
                              GROUP by year") 

neg_abstract_aa <- dbGetQuery(con,"SELECT  year, COUNT(*)  as negative_abs
                              FROM articles 
                              WHERE (abstract LIKE '%detrimental%' OR
                              abstract LIKE '%detrimental_dum%'OR
                              abstract LIKE '%impossible%' OR 
                              abstract LIKE '%inadequate%' OR 
                              abstract LIKE '%ineffective%' OR 
                              abstract LIKE '%insufficient%' OR 
                              abstract LIKE '%irrelevant%' OR
                              abstract LIKE '%pessimistic%' OR 
                              abstract LIKE '%weak%' ) AND
                              (journal = 'American Journal of Political Science' OR
                              journal =  'American Political Science Review') AND
                              (year > 1976 AND year < 2015)
                              Group by year ")

######merge both info together with no. articles

all_abstract_aa <- merge(no.art_aa, pos_abstract_aa, by="year", all.x=T)
all_abstract_aa[is.na(all_abstract_aa$positive_abs),]
###no positive word in 1982
table(all_abstract_aa$positive_abs,useNA = "always")
all_abstract_aa$positive_abs <-ifelse(is.na(all_abstract_aa$positive_abs),
                                      0,all_abstract_aa$positive_abs)
all_abstract_aa <- merge(all_abstract_aa, neg_abstract_aa, by="year", all.x=T)
names(all_abstract_aa)
table(all_abstract_aa$negative_abs,useNA = "always")
all_abstract_aa$negative_abs <-ifelse(is.na(all_abstract_aa$negative_abs),
                                      0,all_abstract_aa$negative_abs)

all_abstract_aa$rel_pos_abs <- all_abstract_aa$positive_abs   / all_abstract_aa$countarticle
all_abstract_aa$rel_neg_abs <- all_abstract_aa$negative_abs / all_abstract_aa$countarticle



csv_robust <- merge(all_abstract_aa,csv_wordcount_year_aa,by="year")

write.csv(csv_robust, file = "csv_robust.csv", row.names = FALSE)
######################









